Import required libraries
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("stringr")) install.packages("stringr")
if (!require("ggrepel")) install.packages("ggrepel")
if (!require("dplyr")) install.packages("dplyr")
if (!require("patchwork")) install.packages("patchwork")
library(ggplot2)
library(stringr)
library(ggrepel)
library(dplyr)
library(patchwork)
Import dataset with gene counts
genes_list <- read.delim(file.path(project_dir, "GSE107593_raw_reads_BCHRNAseq.txt"), check.names=F)
genes_list[,c(3,10:ncol(genes_list))]
# Read GSE107593_series_matrix.txt
metadata_matrix <- read.delim(file.path(project_dir, "Dataset copy, untouched", "GSE107593_series_matrix.txt"), check.names=F, skip = 25)
metadata_matrix_2 <- data.frame(t(metadata_matrix[,2:ncol(metadata_matrix)]))
colnames(metadata_matrix_2) <- metadata_matrix[,1]
# Obtain patient ids
patient_ids <- unique(str_replace(metadata_matrix_2[,12], "subject: ",""))
for (el in patient_ids){
cat(paste0(el,"\n"))
}
157
877
1057
1077
1192
1214
8854
8855
8874
8878
8879
8881
# Obtain sampling locations
location <- unique(str_replace(metadata_matrix_2[,11], "location: ",""))
for (el in sort(location)){
cat(paste0(el,"\n"))
}
Ascending
Descending
Large
Rectum
Sigmoid
Tranverse
# Prepare data for PCA
genes_list_for_pca <- t(genes_list[,c(10:ncol(genes_list))])
colnames(genes_list_for_pca) <- genes_list[,3]
# PCA
pca <- prcomp(genes_list_for_pca, scale=F)
# Screeplot
pca_metrics <- as.data.frame(t(summary(pca)$importance))
colnames(pca_metrics) <- c("Standard_deviation", "Proportion_of_Variance", "Cumulative_Proportion")
pca_metrics$PCs <- as.numeric(substr(rownames(pca_metrics), 3, length(rownames(pca_metrics))))
ggplot(data = pca_metrics[1:10,], aes(x=PCs, y=Proportion_of_Variance)) +
geom_line(color="blue") +
geom_point() +
scale_x_continuous(breaks = c(1:10)) +
labs(title="PCA - Variance explained per principal component")+
xlab("Principal component")+
ylab("Proportion of variance explained")+
theme_light()+
theme(plot.title = element_text(size=12),
axis.title.x = element_text(size = 11),
axis.title.y = element_text(size = 11),
text = element_text(size=11),
aspect.ratio = 1/1.5)
# Create dataframe to contain relevant metadata in appropriate format
df_samples_treatments <- data.frame(row.names = rownames(metadata_matrix_2))
# Get sample name and inflammation status
df_samples_treatments$sample_inflamation <- ""
for (row in (1:nrow(metadata_matrix_2))){
df_samples_treatments[row, ncol(df_samples_treatments)] <- str_split(
rownames(df_samples_treatments)[row],
"Ulcerative colitis Colon Biopsy ",
simplify = T)[2]}
# Get sample name
df_samples_treatments$sample <- ""
for (row in (1:nrow(df_samples_treatments))){
df_samples_treatments[row, ncol(df_samples_treatments)] <- gsub(" [^ ]*$", "", df_samples_treatments[row, ncol(df_samples_treatments)-1])
}
# Get inflammation status
df_samples_treatments$inflammation <- ""
for (row in (1:nrow(df_samples_treatments))){
df_samples_treatments[row, ncol(df_samples_treatments)] <- gsub(".* ", "", df_samples_treatments[row, ncol(df_samples_treatments)-2])
}
# Get sample location
df_samples_treatments$location <- ""
for (row in (1:nrow(df_samples_treatments))){
df_samples_treatments[row, "location"] <- gsub("location: ", "", metadata_matrix_2[row, 11])
}
# Get sampling hospital
df_samples_treatments$hospital <- ""
for (row in (1:nrow(df_samples_treatments))){
df_samples_treatments[row, "hospital"] <- gsub("site: ", "", metadata_matrix_2[row, 10])
}
# Create dataframe out of the pca scores
pca_scores <- data.frame(pca$x)[1:10]
# Get the patient id in the pca scores dataframe
pca_scores$patient <- ""
for (row in (1:nrow(pca_scores))){
for (id_patient in 1:length(patient_ids)){
if(!is.na(str_extract(rownames(pca_scores[row,]), patient_ids[id_patient]))){
pca_scores[row, ncol(pca_scores)] <- str_extract(rownames(pca_scores[row,]), patient_ids[id_patient])
}
}
}
# Get the sample id in the PCA scores dataframe
pca_scores$sample <- rownames(pca_scores)
# Get the inflammation status of each sample
pca_scores$inflammation <- ""
for(row_scores in 1:nrow(pca_scores)){
for(row_df in 1:nrow(df_samples_treatments)){
if(pca_scores[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
pca_scores[row_scores, "inflammation"] <- df_samples_treatments[row_df, "inflammation"]
}
}
}
# Get the sample location in the PCA scores dataframe
pca_scores$location <- ""
for(row_scores in 1:nrow(pca_scores)){
for(row_df in 1:nrow(df_samples_treatments)){
if(pca_scores[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
pca_scores[row_scores, "location"] <- df_samples_treatments[row_df, "location"]
}
}
}
# Get the sampling hospital in the PCA scores dataframe
pca_scores$hospital <- ""
for(row_scores in 1:nrow(pca_scores)){
for(row_df in 1:nrow(df_samples_treatments)){
if(pca_scores[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
pca_scores[row_scores, "hospital"] <- df_samples_treatments[row_df, "hospital"]
}
}
}
p1 <- ggplot(pca_scores, aes(x=PC1, y=PC2, colour = inflammation)) +
geom_point(size=2)+
labs(title = "Inflammation status")+
theme_light()+
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1)+
guides(colour=guide_legend(ncol=1))
p2 <- ggplot(pca_scores, aes(x=PC1, y=PC2, colour = location)) +
geom_point(size=2)+
labs(title = "Sample location")+
theme_light()+
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1)+
guides(colour=guide_legend(ncol=2))
p1+p2+plot_annotation(title = "PCA scores according to sample metadata - inflammation status and sample location",
subtitle = "Counts not normalized",
theme = theme(plot.title = element_text(hjust = 0.5, vjust = 3),
plot.subtitle = element_text(hjust = 0.5, vjust = 3)))
# Create df for normalization:
genes_list_lognorm <- data.frame(matrix(NA, nrow = nrow(genes_list), ncol = ncol(genes_list)))
# Columns before counts
for (col in 1:9){
genes_list_lognorm[[col]] <- genes_list[[col]]
}
# Column names
colnames(genes_list_lognorm) <- colnames(genes_list)
for (i in 10:ncol(genes_list)) {
genes_list_lognorm[, i] <- log2((genes_list[, i] / colSums(genes_list[i]) * 1000000)+8)
}
genes_list_lognorm[,c(3,10:ncol(genes_list))]
# Create dataframe for PCA of lognormalized samples
genes_list_lognorm_for_pca <- t(genes_list_lognorm[,10:ncol(genes_list_lognorm)])
colnames(genes_list_lognorm_for_pca) <- rownames(genes_list_lognorm)
# PCA of lognorm counts
pca_lognorm <- prcomp(genes_list_lognorm_for_pca)
# Screeplot
pca_lognorm_metrics <- as.data.frame(t(summary(pca_lognorm)$importance))
colnames(pca_lognorm_metrics) <- c("Standard_deviation", "Proportion_of_Variance", "Cumulative_Proportion")
pca_lognorm_metrics$PCs <- as.numeric(substr(rownames(pca_lognorm_metrics), 3, length(rownames(pca_lognorm_metrics))))
ggplot(data = pca_lognorm_metrics[1:10,], aes(x=PCs, y=Proportion_of_Variance)) +
geom_line(color="blue") +
geom_point() +
scale_x_continuous(breaks = c(1:10)) +
labs(title="PCA of lognormalized counts - Variance explained per principal component")+
xlab("Principal component")+
ylab("Proportion of variance explained")+
theme_light()+
theme(plot.title = element_text(size=12),
axis.title.x = element_text(size = 11),
axis.title.y = element_text(size = 11),
text = element_text(size=11),
aspect.ratio = 1/1.5)
# Create pca scores dataframe for lognormalized counts pca
pca_scores_lognorm <- data.frame(pca_lognorm$x[,1:10])
pca_scores_lognorm$patient <- pca_scores$patient
pca_scores_lognorm$sample <- pca_scores$sample
pca_scores_lognorm$inflammation <- pca_scores$inflammation
pca_scores_lognorm$location <- pca_scores$location
pca_scores_lognorm$hospital <- pca_scores$hospital
p1 <- ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = inflammation)) +
geom_point(size=2)+
labs(title = "Inflammation status")+
theme_light()+
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1)+
guides(colour=guide_legend(ncol=1))
p2 <- ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = location)) +
geom_point(size=2)+
labs(title = "Sample location")+
theme_light()+
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1)+
guides(colour=guide_legend(ncol=2))
p1+p2+plot_annotation(title = "PCA scores according to sample metadata - inflammation status and sample location",
subtitle = "Counts log-normalized",
theme = theme(plot.title = element_text(hjust = 0.5, vjust = 3),
plot.subtitle = element_text(hjust = 0.5, vjust = 3)))
ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = patient, shape = inflammation, label=patient)) +
geom_point(size=3)+
geom_text_repel(nudge_x = 2)+
labs(title = "PCA scores by patient ID and inflammation status")+
theme_light()+
theme(plot.title = element_text(hjust=0.5))
# Extract the rotations for both pcas (raw and lognormalized)
pca_rotations <- data.frame(pca_lognorm$rotation[,1:10])
pca_lognorm_rotations <- data.frame(pca_lognorm$rotation[,1:10])
# Add column with gene names to rotations dataframes
pca_rotations$gene_name <- genes_list$gene_name
pca_lognorm_rotations$gene_name <- genes_list$gene_name
# Show genes with higher rotation values for pca (raw counts)
top_pos_pca_lognorm <- pca_lognorm_rotations %>% arrange(desc(PC1)) %>% select(c(PC1, gene_name))
top20_pos_pca_lognorm <- top_pos_pca_lognorm[1:20,]
top20_pos_pca_lognorm$gene_name
[1] "IGHG1" "DUOX2" "IGHG3" "PI3" "DUOXA2" "IGHG2"
[7] "LCN2" "CHI3L1" "CXCL1" "IGHG4" "NOS2" "REG1A"
[13] "SLC6A14" "MMP3" "C3" "DMBT1" "IGHV4-34" "REG4"
[19] "PLA2G2A" "IGFBP5"
# DO
# Show genes with more negative rotation values for pca (raw counts)
top_neg_pca_lognorm <- pca_lognorm_rotations %>% arrange(PC1) %>% select(c(PC1, gene_name))
top20_neg_pca_lognorm <- top_neg_pca_lognorm[1:20,]
top20_neg_pca_lognorm$gene_name
[1] "AQP8" "HMGCS2" "SLC26A2" "CA1" "ANPEP" "PCK1"
[7] "CHP2" "B4GALNT2" "GUCA2A" "ADH1C" "PADI2" "SLC26A3"
[13] "ABCG2" "FABP1" "CA2" "LYPD8" "CKB" "SLC51A"
[19] "UGT2A3" "SLC9A3"